####################################
#Diff-in-diff with matching and dual-distance cutoffs - function
#Traffic and Political Attitudes
#Connor Jerzak & Brian Libgober
####################################

dd_with_dd <- function(my_data, #data frame
                       treatment_basis_name, #character string
                       control_basis_name, #character vector
                       outcome_time1_name, #character string
                       outcome_time2_name, #character string
                       matching_variables, #character vector
                       treatment_basis_treated_max, #numeric
                       treatment_basis_control_min, #numeric
                       control_basis_control_max, #numeric
                       control_basis_treated_min, #numeric
                       use_controls = F,
                       boostrap_SDs=F,
                       bootstrap_numb=NULL,
                       conf_level=0.95,
                       match_quietly=T,
                       my_caliper = 0,
                       start_browser=F,
                       my_ratio = 1,
                       my_method = "nearest",
                       my_distance="mahalanobis",
                       block = F,
                       use_match = T,
                       my_discard="none",
                       my_m.order="largest",
                       my_replace=T) #logical
{#begin function brackets
  require(MatchIt, quietly=T)

  if(start_browser==T){browser()}

  #get treatment basis into memory
  treatment_basis <- eval(parse(text=paste("my_data$", treatment_basis_name, "")))

  #get control basis into memory
  control_basis <- eval(parse(text=paste("my_data$", control_basis_name, "")))

  #determine treated_indicator
  my_data <- cbind(my_data, NA)
  colnames(my_data)[ncol(my_data)] <- "treated_indicator"
  my_data$treated_indicator[f2n(treatment_basis) <= treatment_basis_treated_max &
                              f2n(control_basis) >= control_basis_treated_min] <- 1
  my_data$treated_indicator[f2n(control_basis) <= control_basis_control_max &
                              f2n(treatment_basis) >= treatment_basis_control_min] <- 0

  #delete variable with is.na(treated_indicator)=T
  matching_variables_2009 <- gsub(matching_variables, pattern = "2000", replace = "2009")
  matching_variables_2009 <- matching_variables_2009[matching_variables_2009 %in% colnames(my_data)]
  keep_columns <- colnames(my_data) %in% c(matching_variables,
                                             matching_variables_2009,
                                             "County",
                                             outcome_time1_name,
                                             outcome_time2_name,
                                             "treated_indicator")
  my_data <- my_data[,keep_columns]
  my_data <- my_data[is.na(my_data$treated_indicator)==F,]

  #create matching formula
  matching_formula <- paste("treated_indicator ~", paste(matching_variables, collapse="+"))

  #create estimation_function
  match_function <- function(match_input)
  {
      #create matched dataset
      if(my_method=="genetic")
      {
        matched_results <- matchit(as.formula(matching_formula), data = match_input,
                               method = "genetic",
                               distance = my_distance,
                               discard = my_discard,
                               replace = my_replace)
      }

      if(my_method=="nearest")
      {
        matched_results <- matchit(as.formula(matching_formula), data = match_input,
                                 method = "nearest",
                                 distance= my_distance,
                                 discard= my_discard,
                                 caliper = my_caliper,
                                 m.order = my_m.order,
                                 reestimate = T,
                                 ratio = my_ratio,
                                 replace = my_replace)
      }
      else
      {
        matched_results <- matchit(as.formula(matching_formula), data = match_input,
                                   method = my_method,
                                   distance= my_distance,
                                   discard= my_discard,
                                   m.order=my_m.order,
                                   caliper = my_caliper,
                                   replace=my_replace)
      }


      #obtain matched dataset
      matched_dataset <- match.data(matched_results)

      return(list(matched_dataset=matched_dataset,
                  match_metadata=matched_results))
  }

  dd_calculator <- function(input_matched_dataset)
  {
      mean_treat_time1 <- mean(as.numeric(as.character(
                          input_matched_dataset$weights[input_matched_dataset$treated_indicator==1] *
                              input_matched_dataset[input_matched_dataset$treated_indicator==1,which(colnames(input_matched_dataset)==outcome_time1_name)])),
                                          na.rm=T)
      mean_treat_time2 <- mean(as.numeric(as.character(
          input_matched_dataset$weights[input_matched_dataset$treated_indicator==1] *
                      input_matched_dataset[input_matched_dataset$treated_indicator==1,which(colnames(input_matched_dataset)==outcome_time2_name)])),
                                          na.rm=T)

      mean_control_time1 <- mean(as.numeric(as.character(
          input_matched_dataset$weights[input_matched_dataset$treated_indicator==0] *
                input_matched_dataset[input_matched_dataset$treated_indicator==0,which(colnames(input_matched_dataset)==outcome_time1_name)])),
                                            na.rm=T)
      mean_control_time2 <- mean(as.numeric(as.character(
          input_matched_dataset$weights[input_matched_dataset$treated_indicator==0] *
                input_matched_dataset[input_matched_dataset$treated_indicator==0,which(colnames(input_matched_dataset)==outcome_time2_name)])),
                                              na.rm=T)

      diff_in_diff_estimate <- (mean_treat_time2 - mean_treat_time1) - (mean_control_time2 - mean_control_time1)
      return(diff_in_diff_estimate)
  }

  dd_calculator_with_controls <- function(input_matched_dataset)
  {
    lm_summary <- summary(lm(as.formula(
      sprintf("regression_outcome~treated_indicator+post_indicator+treated_indicator:post_indicator+%s",
            paste(paste("County", collapse="+"),
              control_variables, sep="+",
                  collapse="+"))),
      data=input_matched_dataset))

    if(T == F){
      print("No county fixed effects.")
      lm_summary <- summary(lm(as.formula(
        sprintf("regression_outcome~treated_indicator+post_indicator+treated_indicator:post_indicator+%s",
                paste(control_variables, sep="+",
                      collapse="+"))),
        data=input_matched_dataset))
    }
    return(lm_summary)
  }
  #initial investigation of non-matched data

  my_data[,!colnames(my_data) %in% "County"] <- apply(my_data[,!colnames(my_data) %in% "County"], 2, f2n)

  if(use_match == F)
  {
    match_raw_return <- match_numb <- balance_table <- NA
    matched_dataset_return <- my_data
    matched_dataset_return$weights <- 1
    matched_dataset <- matched_dataset_return
  }

  if(use_match == T)
  {

  #create matched dataset
  match_input_return <- my_data
  match_input_return <- as.data.frame(  na.omit( match_input_return ) )
  match_raw_return <- match_function(match_input=match_input_return)
  matched_dataset <- matched_dataset_return <-  match_raw_return$matched_dataset

  #store metadata
  match_numb <- nrow(matched_dataset_return)
  balance_table <- summary(match_raw_return$match_metadata)
  }

  #calculate base result
  if(use_controls == F){  base_result <- dd_calculator(matched_dataset_return) }
  if(use_controls == T)
  {
    ##prep for regression
    prep1 <- cbind(matched_dataset_return, 0, matched_dataset_return[outcome_time1_name])
    prep2 <- cbind(matched_dataset_return, 1, matched_dataset_return[outcome_time2_name])
    colnames(prep1)[ncol(prep1)-1] <- "post_indicator"; colnames(prep1)[ncol(prep1)] <- "regression_outcome"
    colnames(prep2)[ncol(prep2)-1] <- "post_indicator"; colnames(prep2)[ncol(prep2)] <- "regression_outcome"
    prep_reg <- rbind(prep1, prep2)

    matching_variables_without_year <- gsub(matching_variables, pattern = "2000_", replace = "")
    matching_variables_2009_without_year <- gsub(matching_variables_2009, pattern = "2009_", replace = "")
    control_variables <- intersect(matching_variables_2009_without_year, matching_variables_without_year)
    control_variables <- control_variables[!control_variables %in% gsub(outcome_time1_name, pattern = "_2000_", replace = "_")]

    control_2000_df <- matched_dataset_return[,gsub(control_variables, pattern = "census_", replace = "census_2000_")]
    control_2009_df <- matched_dataset_return[,gsub(control_variables, pattern = "census_", replace = "census_2009_")]
    colnames(control_2000_df) <- colnames(control_2009_df) <- control_variables
    control_variables_df <- rbind(control_2000_df,
                                  control_2009_df)
    prep_reg <- cbind(prep_reg,  control_variables_df)

    lm_result <- dd_calculator_with_controls(  prep_reg   )
    base_result <- lm_result[[4]][nrow(lm_result[[4]]),][1]
  }

  #with bootstrap SDs
  lower <- uppper <- NULL
  if(boostrap_SDs==T)
  {
    #generate the bootstrap indices
    if(block == F){
      #find the number of rows
      my_nrows <- nrow(matched_dataset)
      bootstrap_indices <- replicate(n=bootstrap_numb, expr=sample(1:my_nrows, size=my_nrows, replace=T))
    }
    if(block == T)
    {
      match_input <- matched_dataset
      match_input <- as.data.frame(  na.omit( match_input ) )

      unique_counties <- names(table(match_input$County))
      counties_with_treatment <- unique( match_input$County[ which( match_input$treated_indicator == 1) ]  )
      counties_with_control <- unique( match_input$County[ which( match_input$treated_indicator == 0) ]  )
      my_expression <- function()
      {
        counties_boot <- as.character(  sample(counties_with_treatment, 1) )  #make sure we have at least one treated county
        counties_boot <- append(counties_boot, as.character( sample(counties_with_control, 1)) ) #and one control county
        counties_boot <- append(counties_boot, as.character( sample(unique_counties,
                                                      length(unique_counties) - 2,
                                                      replace = T)  )  )
        boot_indices <- c()
        for(county_j in 1:length(counties_boot))
        {
         boot_indices <- append(boot_indices, which(match_input$County %in% counties_boot[county_j]))
        }
        return(boot_indices)
      }
      bootstrap_indices_list <- list()
      for(boot_i in 1:bootstrap_numb)
      {
        bootstrap_indices_list[[boot_i]] <- my_expression()
      }
    }

    #perform the bootstrap, using apply
    if(block == F & use_controls ==F){
      bootstrap_replications <- apply(X=bootstrap_indices, MARGIN=2, function(x){
        matched_dataset_input <-  try(matched_dataset[unlist(x),], T)
        return_results <- try(dd_calculator(matched_dataset_input), T)
        if(class(return_results) == "try-error"){return_results <- NA}
        return( return_results  )
        })
    }
    if(block == T & use_controls == F){
      bootstrap_replications <- lapply(bootstrap_indices_list, function(x){
        matched_dataset_input <-  try(matched_dataset[unlist(x),], T)
        return_results <- try(dd_calculator(matched_dataset_input), T)
        if(class(return_results) == "try-error"){return_results <- NA}
        return( return_results  )
      })
    }
    if(block == T & use_controls == T)
    {
      bootstrap_replications <- lapply(bootstrap_indices_list, function(x){

      ##prep for regression
      my_data_temp <- matched_dataset[unlist(x),]
      prep1 <- cbind(my_data_temp, 0, my_data_temp[outcome_time1_name])
      prep2 <- cbind(my_data_temp, 1, my_data_temp[outcome_time2_name])
      colnames(prep1)[ncol(prep1)-1] <- "post_indicator"; colnames(prep1)[ncol(prep1)] <- "regression_outcome"
      colnames(prep2)[ncol(prep2)-1] <- "post_indicator"; colnames(prep2)[ncol(prep2)] <- "regression_outcome"
      prep_reg <- rbind(prep1, prep2)

      matching_variables_without_year <- gsub(matching_variables, pattern = "2000_", replace = "")
      matching_variables_2009_without_year <- gsub(matching_variables_2009, pattern = "2009_", replace = "")
      control_variables <- intersect(matching_variables_2009_without_year, matching_variables_without_year)
      control_variables <- control_variables[!control_variables %in% gsub(outcome_time1_name, pattern = "_2000_", replace = "_")]

      control_2000_df <- my_data_temp[,gsub(control_variables, pattern = "census_", replace = "census_2000_")]
      control_2009_df <- my_data_temp[,gsub(control_variables, pattern = "census_", replace = "census_2009_")]
      colnames(control_2000_df) <- colnames(control_2009_df) <- control_variables
      control_variables_df <- rbind(control_2000_df,
                                    control_2009_df)
      prep_reg <- cbind(prep_reg,  control_variables_df)

      return_result <- NA
      lm_result <- try(dd_calculator_with_controls(  prep_reg   ) , T)
      return_result_temp <- try(lm_result[[4]][nrow(lm_result[[4]]),][1], T)
      if(class(return_result_temp) != "try-error"){return_result <- return_result_temp}

      return(return_result)
    } )
    }

    if(block == F & use_controls == T)
    {
      bootstrap_replications <- apply(bootstrap_indices, 2, function(x){

        ##prep for regression
        my_data_temp <- matched_dataset[unlist(x),]

        if("try-error" %in% class(my_data_temp)){ return_result <- NA }
        if(!"try-error" %in% class(my_data_temp)){
        prep1 <- cbind(my_data_temp, 0, my_data_temp[outcome_time1_name])
        prep2 <- cbind(my_data_temp, 1, my_data_temp[outcome_time2_name])
        colnames(prep1)[ncol(prep1)-1] <- "post_indicator"; colnames(prep1)[ncol(prep1)] <- "regression_outcome"
        colnames(prep2)[ncol(prep2)-1] <- "post_indicator"; colnames(prep2)[ncol(prep2)] <- "regression_outcome"
        prep_reg <- rbind(prep1, prep2)

        matching_variables_without_year <- gsub(matching_variables, pattern = "2000_", replace = "")
        matching_variables_2009_without_year <- gsub(matching_variables_2009, pattern = "2009_", replace = "")
        control_variables <- intersect(matching_variables_2009_without_year, matching_variables_without_year)
        control_variables <- control_variables[!control_variables %in% gsub(outcome_time1_name, pattern = "_2000_", replace = "_")]

        control_2000_df <- my_data_temp[,gsub(control_variables, pattern = "census_", replace = "census_2000_")]
        control_2009_df <- my_data_temp[,gsub(control_variables, pattern = "census_", replace = "census_2009_")]
        colnames(control_2000_df) <- colnames(control_2009_df) <- control_variables
        control_variables_df <- rbind(control_2000_df,
                                      control_2009_df)
        prep_reg <- cbind(prep_reg,  control_variables_df)

        return_result <- NA
        lm_result <- try(dd_calculator_with_controls(  prep_reg   ) , T)
        return_result_temp <- try(lm_result[[4]][nrow(lm_result[[4]]),][1], T)
        if(class(return_result_temp) != "try-error"){return_result <- return_result_temp}
        }
        return(return_result)
      })
    }

    #find the 95% quantiles
    lower <- quantile(unlist(bootstrap_replications), (1-conf_level)/2, na.rm=T)
    upper <- quantile(unlist(bootstrap_replications), (1+conf_level)/2, na.rm=T)
  }

  return_list <- list(  point_est = base_result,
                        lower = lower,
                        upper = upper,
                        boot_sd = sd(unlist(bootstrap_replications), na.rm = T),
                        match_numb = match_numb,
                        matched_dataset = list(matched_dataset_return),
                        raw_dataset = list(my_data),
                        match_output = match_raw_return,
                        balance_table = list( balance_table) )
  return(return_list)
}#end function barkets

